import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
# Load the data set from DSA.csv and take a first look at its structure.
os.getcwd()
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
os.chdir('/Users/aishwaryamaddimsetty/Downloads')
data = pd.read_csv("DSA.csv")
data.head()
list(data.columns)
len(data.columns)
data.info()
data.isnull().sum()

# Rows that are exact duplicates of a later row.
data_dup = data[data.duplicated(keep="last")]
data_dup.shape
data.shape

# De-duplicated copy. The number of dropped rows is derived from the two
# frames so it stays correct if the input file changes.
data_n = data.drop_duplicates()
data_n.shape
len(data) - len(data_n)

# Split into feature columns (everything except the last column) and the
# target column 'y'.
data_x = data.iloc[:, :-1]
data_x.shape
data_y = data['y']
data_y
import plotly.graph_objects as pl
import plotly.express as px

# Pie chart showing the share of each outcome in the target column 'y'.
# Only `data` is used here; there is no `df` frame in this script.
target_count = data['y'].value_counts()
target_count
colors = ['Red', 'Green']
trace = pl.Pie(
    labels=target_count.index,
    values=target_count.values,
    pull=[0.05],  # pull the first slice slightly out of the pie
    marker=dict(colors=colors),
)
# Final canvas size is 500x700; the legend is positioned outside the plot area.
layout = pl.Layout(
    title="Subscribed to the Term Deposit",
    height=500,
    width=700,
    legend=dict(x=1.1, y=1.3),
)
fig = pl.Figure(data=[trace], layout=layout)
fig.show()
# We can say the data is imbalanced.
# Same pie chart, now with a black line drawn around each slice.
colors = ['Red', 'Green']
slice_outline = {'color': '#000000', 'width': 2}
trace = pl.Pie(
    labels=target_count.index,
    values=target_count.values,
    pull=[0.05],
    marker={'colors': colors, 'line': slice_outline},
)
layout = pl.Layout(
    title="Subscribed to the Term Deposit",
    height=200,
    legend={'x': 1.1, 'y': 1.3},
)
# update_layout returns the figure itself, so the resize can be chained.
fig = pl.Figure(data=[trace], layout=layout).update_layout(height=500, width=700)
fig.show()
data.columns
# Build graphs relating each column to the target variable, to see which
# attributes are most closely related to the result (subscribed or not).
sns.set_style('whitegrid')
plt.figure(figsize=(14, 7))
# Bar plot of 'age' for each value of the target column 'y'.
sns.barplot(data=data, x='y', y='age')
# Stacked bar chart: for every marital status, the number of rows with
# y == 'yes' stacked on the number of rows with y == 'no'.
marital_by_outcome = pd.crosstab(data['marital'], data['y'])
Deposit = list(marital_by_outcome.index)
fig = pl.Figure(data=[
    pl.Bar(name='Yes', x=Deposit, y=marital_by_outcome['yes']),
    pl.Bar(name='No', x=Deposit, y=marital_by_outcome['no']),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
## Visual representation of the 'marital' attribute and its value counts.
# Counts are taken from the data instead of being typed in by hand, so the
# chart stays correct after rows are filtered or de-duplicated.
marital_counts = data['marital'].value_counts()
marital = list(marital_counts.index)
fig = pl.Figure([pl.Bar(x=marital, y=marital_counts.values)])
fig.show()
data['marital'].nunique()
marital_counts
# Bucket 'age' into labelled decades and store the result as 'age_groups'.
data_age = data['age']
# pd.cut intervals are right-inclusive by default, so these edges give
# (0, 19] -> teens, (19, 29] -> 20s, ..., (89, 99] -> 90s.
bins = [0, 19, 29, 39, 49, 59, 69, 79, 89, 99]
data['age_groups'] = pd.cut(
    x=data_age,
    bins=bins,
    labels=['teens', '20s', '30s', '40s', '50s', '60s', '70s', '80s', '90s'],
)
data['age_groups']
data['age'].min()
data['age'].max()
## Visual representation of the age groups, split by the target column 'y'.
# NOTE(review): assumes the intent was "rows per age group for each outcome";
# confirm with the author.
deposit = ['Yes', 'No']
age_group_counts = pd.crosstab(data['age_groups'], data['y'])
fig = pl.Figure([
    # One bar series per outcome; the column keys in 'y' are lower-case.
    pl.Bar(
        name=outcome,
        x=age_group_counts.index.astype(str),
        y=age_group_counts[outcome.lower()],
    )
    for outcome in deposit
])
fig.show()
sns.set_style('whitegrid')
plt.figure(figsize=(14, 7))
sns.barplot(x=data['y'], y=data['age'])
# Plot a histogram for every numerical column.
# 'number' selects all integer and float dtypes regardless of platform,
# whereas 'int' only matches the platform's default integer width.
numerical_columns = data.select_dtypes(include='number')
numerical_columns.hist(figsize=(10, 12))
# Print the value counts of every categorical (object-dtype) column.
category_column = [i for i in data.columns if data[i].dtypes == 'object']
for column in category_column:
    print(column, '\n\n')
    print(data[column].value_counts())
    print("------------" * 4)
# Check how the rows are distributed over the two values of the target column 'y'.
y_column = data['y']
n_rows = len(y_column)
Not_Subscribed = len(data.loc[y_column == 'no'])
Subscribed = len(data.loc[y_column == 'yes'])
percentage_NS = (Not_Subscribed / n_rows) * 100
percentage_Sub = (Subscribed / n_rows) * 100
print('% of People Subscribed:', percentage_Sub)
print('% of People who dint subscribed:', percentage_NS)
# Bar chart of the same counts.
y_column.value_counts().plot.bar()
# `data` has not changed since the counts were taken, so they are still current.
Not_Subscribed
# Visualise the categorical data: one bar chart per categorical column,
# showing its values cross-tabulated against the target column 'y'.
for column in category_column:
    pd.crosstab(data[column], data.y).plot(kind='bar')
    plt.title(column)

# Start a fresh figure so the age distribution is not drawn onto the last
# bar chart produced by the loop.
plt.figure()
sns.distplot(data['age'])
# Data visualisation: per-column count plots to understand how many rows
# fall into each category, dropping rows with an 'unknown' value on the way.
sns.set(style='ticks', color_codes=True)
sns.countplot(y='job', data=data)
data = data[data.job != 'unknown']
sns.countplot(y='marital', data=data)
data.marital.value_counts()
# The category is spelled lower-case 'unknown' in the data (as for job and
# loan); a capitalised 'Unknown' would match nothing and filter no rows.
data = data[data.marital != 'unknown']
data = data[data.loan != 'unknown']
sns.countplot(y='education', data=data)
# Bar plots of two numeric columns for each value of the target column 'y'.
for numeric_column in ('age', 'campaign'):
    sns.set_style('whitegrid')
    plt.figure(figsize=(14, 7))
    sns.barplot(x=data['y'], y=data[numeric_column])

# Count plots of several categorical columns.
# The column is passed as `x=` explicitly: recent seaborn releases treat the
# first positional argument as `data`, not as the x variable.
for categorical_column in ('job', 'marital', 'education', 'housing', 'loan'):
    sns.set_style('whitegrid')
    plt.figure(figsize=(14, 7))
    sns.countplot(x=data[categorical_column])
# Use a heatmap to understand the correlation between the numerical columns.
plt.figure(figsize=(14, 7))
# Restrict to numeric columns first: asking for the correlation of a frame
# that still contains text columns raises an error in pandas >= 2.0.
cor = data.select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True)
plt.show()
# Was the campaign successful? Number of rows with y == 'yes' ...
(data['y'] == 'yes').sum()
# ... and the yes / not-yes split.
(data['y'] == 'yes').value_counts()
# Count plots for every categorical column: first the plain counts, then the
# counts split by the target column 'y'.
sns.set_style('whitegrid')
for feature in (
    'job',
    'marital',
    'default',
    'education',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'poutcome',
):
    # Plain counts on a wide canvas. The column is passed by keyword because
    # recent seaborn releases treat the first positional argument as `data`.
    plt.figure(figsize=(14, 7))
    sns.countplot(x=feature, data=data)
    # Show before drawing the next plot so the two do not share one set of axes.
    plt.show()

    if feature == 'job':
        # 'job' additionally gets a horizontal version of the split plot.
        sns.countplot(y=feature, hue='y', data=data)
        plt.show()

    # Counts split by outcome.
    sns.countplot(x=feature, hue='y', data=data)
    plt.show()
# Box plot of 'age' for each value of the target column 'y'.
# NOTE: no `%matplotlib inline` magic here — it is IPython-only syntax, and
# Jupyter already renders figures inline by default.
sns.boxplot(data=data, x='y', y='age')
plt.show()
# From the above boxplot we can see that customers who subscribed to a term deposit and those who did not both have a median age of around 38-40. The boxplots for the two classes also overlap considerably, which means that age is not necessarily a good indicator of which customers will subscribe and which will not.
# Overall distribution of the 'age' column.
plt.figure(figsize=(10, 8))
sns.distplot(data["age"])
# Render now so the next plot starts on a fresh figure.
plt.show()
# As the above distribution also shows, most of the customers are in the 30-40 age range.
# Box plot of 'duration' for each value of the target column 'y'.
# NOTE: no `%matplotlib inline` magic here — it is IPython-only syntax, and
# Jupyter already renders figures inline by default.
sns.boxplot(data=data, x="y", y="duration")
plt.show()
# From the above plot it is clear that the duration (last contact duration) of a customer can be useful for predicting the target variable. This is expected, because the data overview already mentions that this field highly affects the target variable and should only be used for benchmark purposes.
# Overall distribution of the 'duration' column on a 10x8 inch canvas.
duration_fig, duration_ax = plt.subplots(figsize=(10, 8))
sns.distplot(data["duration"], ax=duration_ax)
plt.show()
# This looks like a power-law distribution, where most of the values are very low and very few are high.
# 'campaign': box plot per value of the target column 'y', then the overall
# distribution.
# NOTE: no `%matplotlib inline` magic here — it is IPython-only syntax, and
# Jupyter already renders figures inline by default.
sns.boxplot(data=data, x="y", y="campaign")
plt.show()
plt.figure(figsize=(10, 8))
sns.distplot(data["campaign"])
plt.show()

# Distinct values of 'pdays' and how often each occurs.
data['pdays'].unique()
data['pdays'].value_counts()
# Most of the values are 999, which means that most of the customers have never been contacted before.
# NOTE: no `%matplotlib inline` magics in this section — they are IPython-only
# syntax, and Jupyter already renders figures inline by default.

# 'pdays': box plot per outcome, then the distribution of each outcome
# overlaid on one figure.
sns.boxplot(data=data, x="y", y="pdays")
plt.show()
plt.figure(figsize=(10, 8))
sns.distplot(data[data["y"] == "yes"]["pdays"])
sns.distplot(data[data["y"] == "no"]["pdays"])
plt.show()

# 'previous': distinct values and their counts, overall and per outcome.
data["previous"].unique()
data["previous"].value_counts()
data[data["y"] == "yes"]["previous"].value_counts()
data[data["y"] == "no"]["previous"].value_counts()

# 'previous': box plot per outcome, the overall distribution, and the
# per-outcome distributions overlaid on one figure.
sns.boxplot(data=data, x="y", y="previous")
plt.show()
plt.figure(figsize=(10, 8))
sns.distplot(data["previous"])
plt.show()
plt.figure(figsize=(10, 8))
sns.distplot(data[data["y"] == "yes"]["previous"])
sns.distplot(data[data["y"] == "no"]["previous"])
plt.show()
# The 'previous' feature is distributed very similarly for both classes of the target variable. From basic EDA it is not clear how much value this individual feature has for predicting the target variable.
# Count plots for the 'previous' column.
# countplot lives in seaborn (`sns`), and the column is passed by keyword
# because recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x='previous', data=data)
plt.show()

# The same counts on a wider canvas.
sns.set_style('whitegrid')
plt.figure(figsize=(14, 7))
sns.countplot(x='previous', data=data)
plt.show()

# Counts split by the target column 'y'.
sns.countplot(x='previous', hue='y', data=data)
plt.show()
data["emp.var.rate"].value_counts()

# For each of the columns below: a box plot per value of the target column
# 'y', followed by the overall distribution on a 10x8 inch canvas.
# NOTE: no `%matplotlib inline` magic here — it is IPython-only syntax, and
# Jupyter already renders figures inline by default.
for indicator in (
    "emp.var.rate",
    "cons.price.idx",
    "cons.conf.idx",
    "euribor3m",
    "nr.employed",
):
    sns.boxplot(data=data, x="y", y=indicator)
    plt.show()
    plt.figure(figsize=(10, 8))
    sns.distplot(data[indicator])
    plt.show()
# Reference: https://rstudio-pubs-static.s3.amazonaws.com/581759_628a43982b744862b56256fde2d14916.html#(22)